In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV (path is relative to the notebook's working directory)
# into the `ratings` DataFrame used by every later cell.
DATA_FILE = "FilmData2.csv"
ratings = pd.read_csv(DATA_FILE)
In [3]:
# Preview the first five rows to sanity-check the loaded columns.
ratings.head()
Out[3]:
Hulu Netflix Amazon DisneyPlus type title release_year region language tconst ... primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres tconst-1 averageRating numVotes
0 NaN NaN NaN NaN NaN NaN NaN US \N tt4048280 ... RED WINDOWS Red Windows 0 2016 \N NaN Thriller tt4048280 2.5 21
1 NaN NaN NaN NaN NaN NaN NaN \N \N tt4048280 ... RED WINDOWS Red Windows 0 2016 \N NaN Thriller tt4048280 2.5 21
2 NaN NaN NaN NaN NaN NaN NaN GB \N tt3335210 ... WHAT GOES UP What Goes Up 0 2014 \N 87.0 Comedy,Romance,Sci-Fi tt3335210 2.5 23
3 NaN NaN NaN NaN NaN NaN NaN \N \N tt3335210 ... WHAT GOES UP What Goes Up 0 2014 \N 87.0 Comedy,Romance,Sci-Fi tt3335210 2.5 23
4 NaN NaN NaN NaN NaN NaN NaN US \N tt3335210 ... WHAT GOES UP What Goes Up 0 2014 \N 87.0 Comedy,Romance,Sci-Fi tt3335210 2.5 23

5 rows × 21 columns

In [4]:
# One boolean mask per selection criterion, combined in a single indexing step:
#   - titleType is 'movie'
#   - genres is not the literal "\N" placeholder string
#   - region is 'US'
#   - strictly more than 100 votes
# NOTE(review): the preview of `ratings` shows several rows sharing one tconst
# (one per region value); confirm whether repeated titles can remain after the
# region filter and whether de-duplicating on tconst is wanted.
is_movie = ratings['titleType'] == 'movie'
has_genre = ratings['genres'] != '\\N'
is_us_row = ratings['region'] == 'US'
enough_votes = ratings['numVotes'] > 100
vals = ratings[is_movie & has_genre & is_us_row & enough_votes]
In [5]:
# Preview the first five rows that survived the filtering step.
vals.head()
Out[5]:
Hulu Netflix Amazon DisneyPlus type title release_year region language tconst ... primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres tconst-1 averageRating numVotes
47 NaN NaN NaN NaN NaN NaN NaN US \N tt4412218 ... RUNNING MAN Benpao Ba! Xiongdi 0 2015 \N 88.0 Action,Comedy,Documentary tt4412218 2.5 115
55 NaN NaN NaN NaN NaN NaN NaN US \N tt4253170 ... THE APOSTLES Gui zhen 0 2013 \N 91.0 Fantasy,Horror,Sci-Fi tt4253170 2.5 133
58 NaN NaN NaN NaN NaN NaN NaN US \N tt4679576 ... BLOODHOUND Bloodhound 0 2020 \N 72.0 Crime,Drama tt4679576 2.5 135
65 NaN NaN NaN NaN NaN NaN NaN US \N tt4591226 ... THE LAST HOUSE The Last House 0 2015 \N 91.0 Horror tt4591226 2.5 155
67 NaN NaN NaN NaN NaN NaN NaN US \N tt2757592 ... A TALKING PONY!?! A Talking Pony!?! 0 2013 \N 88.0 Comedy,Family,Fantasy tt2757592 2.5 184

5 rows × 21 columns

In [6]:
# Collect every distinct genre name, in order of first appearance.
# Each cell of the column is a comma-separated string such as
# "Comedy,Romance,Sci-Fi".
# Missing (NaN) cells are dropped first: the row filter only excludes the
# literal "\N" marker, so a NaN would otherwise reach .split() and raise
# AttributeError.
# dict.fromkeys de-duplicates while preserving insertion order, replacing the
# repeated linear `item not in genres` scan over a growing list.
genres = list(
    dict.fromkeys(
        name
        for entry in vals["genres"].dropna()
        for name in entry.split(",")
    )
)
In [7]:
# Show the distinct genre names collected from vals["genres"].
genres
Out[7]:
['Action',
 'Comedy',
 'Documentary',
 'Fantasy',
 'Horror',
 'Sci-Fi',
 'Crime',
 'Drama',
 'Family',
 'Thriller',
 'Mystery',
 'Adventure',
 'Animation',
 'Romance',
 'Sport',
 'Music',
 'Biography',
 'Western',
 'War',
 'Reality-TV',
 'History',
 'Musical',
 'News',
 'Talk-Show',
 'Adult',
 'Short']
In [8]:
# Pull out the rating column on its own.
# Despite the name, `df` is a one-dimensional Series, not a DataFrame.
df = vals.loc[:, 'averageRating']
In [9]:
# Length of the rating Series (one entry per row of `vals`).
df.shape
Out[9]:
(57592,)
In [10]:
import plotly 
import plotly.express as px
In [11]:
# Histogram of average ratings over the filtered rows in `vals`.
# The column is referenced by name; plotly express looks it up in data_frame.
fig = px.histogram(
    data_frame=vals,
    x="averageRating",
    title="IMDB Scores of the Programs",
)
fig.show()
In [12]:
# Horizontal box plot summarising the spread of average ratings in `vals`
# (median, quartiles and outliers).
fig = px.box(data_frame=vals, x="averageRating")
fig.show()
In [13]:
# Pairwise correlation matrix (pandas default method) between rating and runtime.
vals[["averageRating", "runtimeMinutes"]].corr()
Out[13]:
averageRating runtimeMinutes
averageRating 1.000000 0.025417
runtimeMinutes 0.025417 1.000000
In [14]:
# Ten highest-rated rows of `vals`, restricted to the columns the plot uses.
top_10_ratings = (
    vals[["averageRating", "title", "genres", "startYear"]]
    .sort_values(["averageRating"], ascending=False)
    .head(10)
)

# NOTE(review): the previews of `vals` show NaN in `title` while `primaryTitle`
# is populated for the same rows; confirm `title` is the intended label column.
fig = px.scatter(
    top_10_ratings,
    x='averageRating',
    y='title',
    color='genres',
    # hover_data expects column names (or a dict), not a DataFrame slice.
    hover_data=['genres', 'startYear'],
    title="Top 10 High Rated Programs",
)
fig.show()
In [ ]:
 
In [25]:
# Pairwise correlation matrix (pandas default method) between rating and start year.
vals[["averageRating", "startYear"]].corr()
Out[25]:
averageRating startYear
averageRating 1.000000 -0.016972
startYear -0.016972 1.000000
In [116]:
# Single-column DataFrame holding only the ratings.
# NOTE(review): no later cell shown here reads `plotdata`; confirm it is still needed.
plotdata = vals["averageRating"].to_frame()
In [117]:
# Mean rating per start year.
# The aggregate has to be kept and plotted: charting the raw rows of `vals`
# stacks one bar segment per row, so each bar's height would be the per-year
# sum of ratings rather than the average.
yearly_mean_rating = vals.groupby('startYear', as_index=False)['averageRating'].mean()

fig = px.bar(data_frame=yearly_mean_rating, x="startYear", y="averageRating")
fig.show()
In [22]:
# Average rating for each start year, as a two-column frame
# (startYear, averageRating) with a fresh integer index.
df2 = (
    vals.groupby('startYear')['averageRating']
    .mean()
    .reset_index()
)
In [ ]:
 
In [23]:
# Bar chart of the per-year mean rating held in `df2`; columns are
# referenced by name and resolved against data_frame.
fig = px.bar(
    data_frame=df2,
    x="startYear",
    y="averageRating",
)
fig.show()
In [15]:
# Mean rating per individual genre.
# `genres` holds comma-joined strings (e.g. "Comedy,Romance,Sci-Fi"); grouping
# on that raw string yields one bar per genre *combination*. Split each string
# and explode to one row per (title, genre) so that every row contributes to
# each genre it lists.
# Rows with a missing genres value are dropped before splitting.
df3 = (
    vals.dropna(subset=["genres"])
    .assign(genres=lambda frame: frame["genres"].str.split(","))
    .explode("genres")
    .groupby("genres", as_index=False)["averageRating"]
    .mean()
)
fig = px.bar(data_frame=df3, x="genres", y="averageRating")
fig.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: